{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# RAG With llama-index  + Milvus + LLama\n",
    "\n",
    "References\n",
    "- https://docs.llamaindex.ai/en/stable/examples/vector_stores/MilvusIndexDemo/\n",
    "- https://docs.llamaindex.ai/en/stable/api_reference/storage/vector_store/milvus/?h=milvusvectorstore#llama_index.vector_stores.milvus.MilvusVectorStore"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step-1: Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from my_config import MY_CONFIG\n",
    "\n",
    "MY_CONFIG.DB_URI = './rag_2_llamaindex.db'\n",
    "MY_CONFIG.COLLECTION_NAME = 'llamaindex_papers'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step-2: Read documents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loaded 71 chunks\n",
      "CPU times: user 1.72 s, sys: 22.6 ms, total: 1.74 s\n",
      "Wall time: 1.8 s\n"
     ]
    }
   ],
   "source": [
    "%%time \n",
    "\n",
    "from llama_index.core import SimpleDirectoryReader\n",
    "import pprint\n",
    "\n",
    "# load documents\n",
    "documents = SimpleDirectoryReader(\n",
    "    # input_dir = './data/10k/input/'\n",
    "    input_dir = MY_CONFIG.INPUT_DATA_DIR\n",
    ").load_data()\n",
    "\n",
    "print (f\"Loaded {len(documents)} chunks\")\n",
    "\n",
    "# print(\"Document [0].doc_id:\", documents[0].doc_id)\n",
    "# pprint.pprint (documents[0], indent=4)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step-3: Setup Embedding Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# If connection to https://huggingface.co/ failed, uncomment the following path\n",
    "import os\n",
    "os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.embeddings.huggingface import HuggingFaceEmbedding\n",
    "from llama_index.core import Settings\n",
    "\n",
    "Settings.embed_model = HuggingFaceEmbedding(\n",
    "    model_name = MY_CONFIG.EMBEDDING_MODEL\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step-4: Connect to Milvus"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ Connected to Milvus instance:  ./rag_2_llamaindex.db\n"
     ]
    }
   ],
   "source": [
    "from pymilvus import MilvusClient\n",
    "\n",
    "milvus_client = MilvusClient(MY_CONFIG.DB_URI)\n",
    "print (\"✅ Connected to Milvus instance: \", MY_CONFIG.DB_URI )\n",
    "\n",
    "# if we already have a collection, clear it first\n",
    "if milvus_client.has_collection(collection_name = MY_CONFIG.COLLECTION_NAME):\n",
    "    milvus_client.drop_collection(collection_name = MY_CONFIG.COLLECTION_NAME)\n",
    "    print ('✅ Cleared collection :', MY_CONFIG.COLLECTION_NAME)\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ Connected Llama-index to Milvus instance:  ./rag_2_llamaindex.db\n"
     ]
    }
   ],
   "source": [
    "# connect to vector db\n",
    "from llama_index.core import VectorStoreIndex, StorageContext\n",
    "from llama_index.vector_stores.milvus import MilvusVectorStore\n",
    "\n",
    "vector_store = MilvusVectorStore(\n",
    "    uri = MY_CONFIG.DB_URI ,\n",
    "    dim = MY_CONFIG.EMBEDDING_LENGTH , \n",
    "    collection_name = MY_CONFIG.COLLECTION_NAME,\n",
    "    overwrite=True\n",
    ")\n",
    "storage_context = StorageContext.from_defaults(vector_store=vector_store)\n",
    "\n",
    "print (\"✅ Connected Llama-index to Milvus instance: \", MY_CONFIG.DB_URI )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step-5: Create Index and Save to DB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ Created index: <llama_index.core.indices.vector_store.base.VectorStoreIndex object at 0x7d60c42570d0>\n",
      "✅ Saved index to db  ./rag_2_llamaindex.db\n",
      "CPU times: user 729 ms, sys: 143 ms, total: 872 ms\n",
      "Wall time: 729 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# create an index\n",
    "\n",
    "from llama_index.core import VectorStoreIndex\n",
    "\n",
    "index = VectorStoreIndex.from_documents(\n",
    "    documents, storage_context=storage_context\n",
    ")\n",
    "print (\"✅ Created index:\", index )\n",
    "print (\"✅ Saved index to db \", MY_CONFIG.DB_URI )"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "dpk-4-rag-pdf-r1.1.0-py3.11",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
